{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DB from https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset\n",
    "# By Stefano Leone - https://www.kaggle.com/stefanoleone992\n",
    "# Published under CC0 license."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "import os\n",
    "import pathlib\n",
    "import sqlite3\n",
    "from typing import Optional, Tuple\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sqlalchemy import (\n",
    "    Column, DateTime, Float, ForeignKey,\n",
    "    Integer, String, Table, create_engine,\n",
    ")\n",
    "from sqlalchemy.ext.declarative import declarative_base\n",
    "\n",
    "DB_PATH = 'imdb3.db'\n",
    "\n",
    "engine = create_engine(f'sqlite:///{DB_PATH}', echo=True)\n",
    "Base = declarative_base()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Country(Base):\n",
    "    __tablename__ = 'countries'\n",
    "\n",
    "    id = Column(Integer, primary_key=True)\n",
    "    name = Column(String, nullable=False)\n",
    "\n",
    "\n",
    "class Genre(Base):\n",
    "    __tablename__ = 'genres'\n",
    "\n",
    "    id = Column(Integer, primary_key=True)\n",
    "    name = Column(String, nullable=False)\n",
    "\n",
    "\n",
    "class Language(Base):\n",
    "    __tablename__ = 'languages'\n",
    "\n",
    "    id = Column(Integer, primary_key=True)\n",
    "    name = Column(String, nullable=False)\n",
    "\n",
    "\n",
    "movie_country = Table('movie_countries', Base.metadata,\n",
    "    Column('movie_id', String, ForeignKey('movies.id')),\n",
    "    Column('country_id', Integer, ForeignKey('countries.id'))\n",
    ")\n",
    "\n",
    "\n",
    "movie_genre = Table('movie_genres', Base.metadata,\n",
    "    Column('movie_id', String, ForeignKey('movies.id')),\n",
    "    Column('genre_id', Integer, ForeignKey('genres.id'))\n",
    ")\n",
    "\n",
    "\n",
    "movie_language = Table('movie_languages', Base.metadata,\n",
    "    Column('movie_id', String, ForeignKey('movies.id')),\n",
    "    Column('language_id', Integer, ForeignKey('languages.id'))\n",
    ")\n",
    "\n",
    "    \n",
    "class Movie(Base):\n",
    "    __tablename__ = 'movies'\n",
    "\n",
    "    id = Column(String, primary_key=True)\n",
    "    title = Column(String, nullable=False)\n",
    "    original_title = Column(String, nullable=False)\n",
    "    year = Column(Integer, nullable=False)\n",
    "    avg_vote = Column(String, nullable=False)\n",
    "    votes = Column(Integer, nullable=False)\n",
    "    duration = Column(Integer, nullable=False)\n",
    "    budget = Column(String)\n",
    "    gross_income = Column(String)\n",
    "\n",
    "    \n",
    "class Name(Base):\n",
    "    __tablename__ = 'names'\n",
    "\n",
    "    id = Column(String, primary_key=True)\n",
    "    name = Column(String, nullable=False)\n",
    "    height = Column(Float)\n",
    "    date_of_birth = Column(DateTime, nullable=True)\n",
    "    date_of_death = Column(DateTime, nullable=True)\n",
    "    children = Column(Integer, nullable=False)\n",
    "\n",
    "\n",
    "class Principal(Base):\n",
    "    __tablename__ = 'principals'\n",
    "\n",
    "    movie_id = Column(String, ForeignKey('movies.id'), primary_key=True)\n",
    "    ordering = Column(String, primary_key=True)\n",
    "    name_id = Column(String, ForeignKey('names.id'), nullable=False)\n",
    "    job_id = Column(Integer, ForeignKey('jobs.id'), nullable=False)\n",
    "    characters = Column(String)\n",
    "\n",
    "\n",
    "class Job(Base):\n",
    "    __tablename__ = 'jobs'\n",
    "\n",
    "    id = Column(Integer, primary_key=True)\n",
    "    name = Column(String, nullable=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Base.metadata.create_all(engine)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "CREATE_TABLE_SETTINGS = {'con': engine, 'if_exists': 'append'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = pd.read_csv(\n",
    "    'names.csv',\n",
    "    parse_dates=['date_of_birth', 'date_of_death'],\n",
    "    infer_datetime_format=True,\n",
    "    na_values=\"None\",\n",
    ")[\n",
    "    ['imdb_name_id', 'name', 'height',\n",
    "     'date_of_birth', 'date_of_death', 'children']\n",
    "]\n",
    "\n",
    "principals = pd.read_csv(\n",
    "    'title_principals.csv', na_values=\"None\",\n",
    ")[['imdb_title_id', 'ordering', 'imdb_name_id', 'category', 'characters']]\n",
    "\n",
    "movies = pd.read_csv(\n",
    "    'movies.csv', parse_dates=['date_published'], na_values=\"None\",\n",
    ")[\n",
    "    ['imdb_title_id', 'title', 'original_title', 'year', 'genre',\n",
    "     'duration', 'avg_vote', 'votes', 'country', 'language', 'budget',\n",
    "     'worlwide_gross_income']\n",
    "]\n",
    "\n",
    "\n",
    "movies.rename(\n",
    "    columns={'imdb_title_id': 'id', 'worlwide_gross_income': 'gross_income'},\n",
    "    inplace=True,\n",
    ")\n",
    "principals.rename(\n",
    "    columns={'imdb_title_id': 'movie_id', 'imdb_name_id': 'name_id',\n",
    "             'category': 'job_id'},\n",
    "    inplace=True,\n",
    ")\n",
    "names.rename(columns={'imdb_name_id': 'id'}, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "names['date_of_birth'] = pd.to_datetime(names['date_of_birth'], errors='coerce')\n",
    "names['date_of_death'] = pd.to_datetime(names['date_of_death'], errors='coerce')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "movies.set_index('id', inplace=True)\n",
    "names.set_index('id', inplace=True)\n",
    "principals.set_index(['movie_id', 'ordering'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tables = {\n",
    "    'names': names,\n",
    "    'principals': principals,\n",
    "    'movies': movies,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create M2Ms\n",
    "\n",
    "TO_CONVERT = [\n",
    "    ('movies', 'genre', 'genres'),\n",
    "    ('movies', 'language', 'languages'),\n",
    "    ('movies', 'country', 'countries'),\n",
    "]\n",
    "\n",
    "for t, c, many in TO_CONVERT:\n",
    "    values = {\n",
    "        value.strip()\n",
    "        for vals in tables[t][c].str.split(',')\n",
    "        for value in (vals if isinstance(vals, list) else [])\n",
    "    }\n",
    "\n",
    "    second_table = list(enumerate(values, 1))\n",
    "    tables[many] = pd.DataFrame(second_table, columns=['id', 'name'])\n",
    "    tables[many].set_index('id', inplace=True)\n",
    "\n",
    "    # Create the M2M relationships\n",
    "    values_kv = {v: k for k, v in second_table}\n",
    "    titles_value = (\n",
    "        (i, values_kv.get(value))\n",
    "        for i, title in tables[t].iterrows() if isinstance(title[c], str)\n",
    "        for value in map(str.strip, title[c].split(','))\n",
    "    )\n",
    "\n",
    "    tables[f'movie_{many}'] = pd.DataFrame(titles_value, columns=['movie_id', f'{c}_id'])\n",
    "    tables[t].drop([c], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "jobs = list(enumerate(principals['job_id'].unique(), 1))\n",
    "tables['jobs'] = pd.DataFrame(jobs, columns=['id', 'name'])\n",
    "for job_id, job_name in jobs:\n",
    "    tables['principals'].replace({job_name: job_id}, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "delete_chars = str.maketrans(\"\", \"\", \"[]\\\"\")\n",
    "\n",
    "tables['principals']['characters'] = (\n",
    "    tables['principals']['characters'].astype(str)\n",
    "    .str.translate(delete_chars)\n",
    "    .replace(',', ', ').replace('nan', np.nan)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "WITHOUT_INDEX = {'movie_genres', 'movie_languages', 'movie_countries', 'jobs'}\n",
    "\n",
    "for name, df in tables.items():\n",
    "    CREATE_TABLE_SETTINGS['index'] = name not in WITHOUT_INDEX\n",
    "    print(name, CREATE_TABLE_SETTINGS)\n",
    "    df.to_sql(name, **CREATE_TABLE_SETTINGS)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
